Communication

  • For Niraj TE
  • Mut vs WT
  • Batch: SE and PE
  • Combine Regular Genes Expression (RNAseq data) with TE expression (better normalized expression)

Notes:

Code LOG:

  • included annotation
  • included merging
  • used TPM normalization
  • save seperate results into excel rather than csv to prevent gene name bug in excel
  • print cut-offs
  • export results in original order, not ordered by LFC nor FDR (more robust)
  • export significant results, too (FDR and |LFC|, not including edges)
  • better VolcanoPlot by setting Max Y to 50 (>50 set to 50)
  • better html output, skip nb, use html
  • both OneBigModel and SeperatedModels
  • fixed row.names (gencode id) bug
  • output both FPKM and TPM, 06/28/2019
  • both shrinked LFC and raw LFC in same output Excel, and both plots
  • add lib-size
  • Pvalue -> FDR in Volcano-plot, ylim 0,50, xlim fixing, smaller dots
  • optional function to aggregate multi-loci genes (e.g. his1, his2a, his2b, his3, his4)
  • Use gray scale for MA/Volcano-plots
  • Output COUNT.xlsx
  • volcano plot: up/down count
  • length in TPM and DEseq2 outputs after aggregating histone genes
  • Reverse SampleNAme:RPKM
  • hist log(count+1)
  • added optinal density scatterplot between length and |LFC|
  • fixed sig.xlsx index bug

Read Data

df1 <- read.table('../featureCount/counts.gene_id.run1.pe.txt', 
                 sep="\t", header=TRUE, 
                 row.names = 1) # row.name in cts(matrix)
colnames(df1)
## [1] "Chr"                                      
## [2] "Start"                                    
## [3] "End"                                      
## [4] "Strand"                                   
## [5] "Length"                                   
## [6] "...bam.run2_local.DelMut.PE.run2.bam"     
## [7] "...bam.run2_local.FRT19A.CTRL.PE.run2.bam"
## [8] "...bam.run2_local.PointMut.PE.run2.bam"   
## [9] "...bam.run2_local.W1118.CTRL.PE.run2.bam"
dim(df1)
## [1] 175   9
df2 <- read.table('../featureCount/counts.gene_id.run1.se.txt', 
                 sep="\t", header=TRUE, 
                 row.names = 1) # row.name in cts(matrix)
colnames(df2)
## [1] "Chr"                                      
## [2] "Start"                                    
## [3] "End"                                      
## [4] "Strand"                                   
## [5] "Length"                                   
## [6] "...bam.run2_local.DelMut.SE.run2.bam"     
## [7] "...bam.run2_local.FRT19A.CTRL.SE.run2.bam"
## [8] "...bam.run2_local.PointMut.SE.run2.bam"   
## [9] "...bam.run2_local.W1118.CTRL.SE.run2.bam"
dim(df2)
## [1] 175   9
df <- merge(df1, df2, by=c(1,2,3,4,5))
row.names(df) <- df$Chr
head(df)
##                               Chr Start  End Strand Length
## AY187768_BUT6       AY187768_BUT6     1  387      +    387
## AY313771_ISBU3     AY313771_ISBU3     1  993      +    993
## FBgn0000004_17.6 FBgn0000004_17.6     1 7439      +   7439
## FBgn0000005_297   FBgn0000005_297     1 6995      +   6995
## FBgn0000006_412   FBgn0000006_412     1 7567      +   7567
## FBgn0000007_1731 FBgn0000007_1731     1 4648      +   4648
##                  ...bam.run2_local.DelMut.PE.run2.bam
## AY187768_BUT6                                       4
## AY313771_ISBU3                                     15
## FBgn0000004_17.6                                 5945
## FBgn0000005_297                                  9477
## FBgn0000006_412                                 42344
## FBgn0000007_1731                                70047
##                  ...bam.run2_local.FRT19A.CTRL.PE.run2.bam
## AY187768_BUT6                                            3
## AY313771_ISBU3                                           4
## FBgn0000004_17.6                                       845
## FBgn0000005_297                                       6271
## FBgn0000006_412                                      69945
## FBgn0000007_1731                                      2326
##                  ...bam.run2_local.PointMut.PE.run2.bam
## AY187768_BUT6                                         6
## AY313771_ISBU3                                       13
## FBgn0000004_17.6                                   5921
## FBgn0000005_297                                   10445
## FBgn0000006_412                                   33547
## FBgn0000007_1731                                  60339
##                  ...bam.run2_local.W1118.CTRL.PE.run2.bam
## AY187768_BUT6                                           0
## AY313771_ISBU3                                         11
## FBgn0000004_17.6                                     3732
## FBgn0000005_297                                      7325
## FBgn0000006_412                                     57063
## FBgn0000007_1731                                     1413
##                  ...bam.run2_local.DelMut.SE.run2.bam
## AY187768_BUT6                                      21
## AY313771_ISBU3                                      0
## FBgn0000004_17.6                                 6395
## FBgn0000005_297                                  5765
## FBgn0000006_412                                 16510
## FBgn0000007_1731                                17574
##                  ...bam.run2_local.FRT19A.CTRL.SE.run2.bam
## AY187768_BUT6                                           33
## AY313771_ISBU3                                           0
## FBgn0000004_17.6                                       618
## FBgn0000005_297                                       3622
## FBgn0000006_412                                      22950
## FBgn0000007_1731                                      1118
##                  ...bam.run2_local.PointMut.SE.run2.bam
## AY187768_BUT6                                        19
## AY313771_ISBU3                                        0
## FBgn0000004_17.6                                   4149
## FBgn0000005_297                                    8894
## FBgn0000006_412                                   13598
## FBgn0000007_1731                                  16304
##                  ...bam.run2_local.W1118.CTRL.SE.run2.bam
## AY187768_BUT6                                           2
## AY313771_ISBU3                                          0
## FBgn0000004_17.6                                     3853
## FBgn0000005_297                                     11389
## FBgn0000006_412                                     17296
## FBgn0000007_1731                                      686
colnames(df)
##  [1] "Chr"                                      
##  [2] "Start"                                    
##  [3] "End"                                      
##  [4] "Strand"                                   
##  [5] "Length"                                   
##  [6] "...bam.run2_local.DelMut.PE.run2.bam"     
##  [7] "...bam.run2_local.FRT19A.CTRL.PE.run2.bam"
##  [8] "...bam.run2_local.PointMut.PE.run2.bam"   
##  [9] "...bam.run2_local.W1118.CTRL.PE.run2.bam" 
## [10] "...bam.run2_local.DelMut.SE.run2.bam"     
## [11] "...bam.run2_local.FRT19A.CTRL.SE.run2.bam"
## [12] "...bam.run2_local.PointMut.SE.run2.bam"   
## [13] "...bam.run2_local.W1118.CTRL.SE.run2.bam"
dim(df)
## [1] 175  13

Clean Up Data

# clean names
colnames(df) <- gsub("\\.\\.\\.bam.run2_local.", "", colnames(df))
colnames(df) <- gsub(".run2.bam", "", colnames(df))
colnames(df) <- gsub("sorted_reads.", "", colnames(df))
# colname_formatted <- c(
#     "KO_D0_3", "WT_D0_3", "KO_D2_3", "WT_D2_3", "KO_D8_3", "WT_D8_3",
#     "KO_D0_1", "KO_D0_2", "KO_D2_1", "KO_D2_2", "KO_D8_1", "KO_D8_2", 
#     "WT_D0_1", "WT_D0_2", "WT_D2_1", "WT_D2_2", "WT_D8_1", "WT_D8_2")
# paste(colnames(df), colname_formatted, sep = "->")
# colnames(df) <- colname_formatted
df<-df[,c( "Chr","Start","End","Strand","Length",
           "W1118.CTRL.PE","W1118.CTRL.SE",
          "FRT19A.CTRL.PE", "FRT19A.CTRL.SE",
          "DelMut.PE", "DelMut.SE",
          "PointMut.PE"   , "PointMut.SE" )]
colnames(df)
##  [1] "Chr"            "Start"          "End"            "Strand"        
##  [5] "Length"         "W1118.CTRL.PE"  "W1118.CTRL.SE"  "FRT19A.CTRL.PE"
##  [9] "FRT19A.CTRL.SE" "DelMut.PE"      "DelMut.SE"      "PointMut.PE"   
## [13] "PointMut.SE"
head(df)
##                               Chr Start  End Strand Length W1118.CTRL.PE
## AY187768_BUT6       AY187768_BUT6     1  387      +    387             0
## AY313771_ISBU3     AY313771_ISBU3     1  993      +    993            11
## FBgn0000004_17.6 FBgn0000004_17.6     1 7439      +   7439          3732
## FBgn0000005_297   FBgn0000005_297     1 6995      +   6995          7325
## FBgn0000006_412   FBgn0000006_412     1 7567      +   7567         57063
## FBgn0000007_1731 FBgn0000007_1731     1 4648      +   4648          1413
##                  W1118.CTRL.SE FRT19A.CTRL.PE FRT19A.CTRL.SE DelMut.PE
## AY187768_BUT6                2              3             33         4
## AY313771_ISBU3               0              4              0        15
## FBgn0000004_17.6          3853            845            618      5945
## FBgn0000005_297          11389           6271           3622      9477
## FBgn0000006_412          17296          69945          22950     42344
## FBgn0000007_1731           686           2326           1118     70047
##                  DelMut.SE PointMut.PE PointMut.SE
## AY187768_BUT6           21           6          19
## AY313771_ISBU3           0          13           0
## FBgn0000004_17.6      6395        5921        4149
## FBgn0000005_297       5765       10445        8894
## FBgn0000006_412      16510       33547       13598
## FBgn0000007_1731     17574       60339       16304
dim(df)
## [1] 175  13

Create cts

cts <- df[, 6:ncol(df)]
cts <- as.matrix(cts)
colnames(cts)
## [1] "W1118.CTRL.PE"  "W1118.CTRL.SE"  "FRT19A.CTRL.PE" "FRT19A.CTRL.SE"
## [5] "DelMut.PE"      "DelMut.SE"      "PointMut.PE"    "PointMut.SE"
head(cts)
##                  W1118.CTRL.PE W1118.CTRL.SE FRT19A.CTRL.PE FRT19A.CTRL.SE
## AY187768_BUT6                0             2              3             33
## AY313771_ISBU3              11             0              4              0
## FBgn0000004_17.6          3732          3853            845            618
## FBgn0000005_297           7325         11389           6271           3622
## FBgn0000006_412          57063         17296          69945          22950
## FBgn0000007_1731          1413           686           2326           1118
##                  DelMut.PE DelMut.SE PointMut.PE PointMut.SE
## AY187768_BUT6            4        21           6          19
## AY313771_ISBU3          15         0          13           0
## FBgn0000004_17.6      5945      6395        5921        4149
## FBgn0000005_297       9477      5765       10445        8894
## FBgn0000006_412      42344     16510       33547       13598
## FBgn0000007_1731     70047     17574       60339       16304
paste("lib-size:")
## [1] "lib-size:"
colSums(cts)
##  W1118.CTRL.PE  W1118.CTRL.SE FRT19A.CTRL.PE FRT19A.CTRL.SE      DelMut.PE 
##        1633796         795150         591846         243766        4645652 
##      DelMut.SE    PointMut.PE    PointMut.SE 
##        2908432        4595310        2819218

Filter Based on Expression

expression_filter <- rowSums(cts) >= 1  # default 10
cts <- cts[expression_filter, ]
df <- df[expression_filter, ]
dim(cts)
## [1] 159   8
dim(df)
## [1] 159  13

Read Anno

anno <- read.table(
"../../TEdata/Drosophila_melanogaster.BDGP6.22.96.CombineWith.dm6.transposon.anno.txt.gz",
    header=T)
colnames(anno)
## [1] "Gene"   "Name"   "Type"   "Length"
#anno <- anno[, c("peak_id", "seqnames", "start", "end", "width",  "geneBiotype", "geneID", "geneName" )]
dim(anno)
## [1] 17933     4
head(anno)
##          Gene           Name           Type Length
## 1 FBgn0000003 7SLRNA:CR32864          ncRNA    299
## 2 FBgn0000008              a protein_coding   5414
## 3 FBgn0000014          abd-A protein_coding   5477
## 4 FBgn0000015          Abd-B protein_coding  11791
## 5 FBgn0000017            Abl protein_coding  12819
## 6 FBgn0000018            abo protein_coding   1794

COUNT.TE output

count <- data.frame(cts)
colnames(count) <- paste(colnames(count),"COUNT", sep = ":")
count_out <- merge(anno, count, by.x=1, by.y=0, all.y=T, sort=F)
head(count_out)
##                 Gene               Name           Type Length
## 1 FBgn0026065_Idefix FBgn0026065_Idefix dm6.transposon   7411
## 2   FBgn0000004_17.6   FBgn0000004_17.6 dm6.transposon   7439
## 3   FBgn0000007_1731   FBgn0000007_1731 dm6.transposon   4648
## 4    FBgn0000005_297    FBgn0000005_297 dm6.transposon   6995
## 5   FBgn0005384_3S18   FBgn0005384_3S18 dm6.transposon   6126
## 6    FBgn0000006_412    FBgn0000006_412 dm6.transposon   7567
##   W1118.CTRL.PE:COUNT W1118.CTRL.SE:COUNT FRT19A.CTRL.PE:COUNT
## 1                2640                 533                 2231
## 2                3732                3853                  845
## 3                1413                 686                 2326
## 4                7325               11389                 6271
## 5               10501                6263                24469
## 6               57063               17296                69945
##   FRT19A.CTRL.SE:COUNT DelMut.PE:COUNT DelMut.SE:COUNT PointMut.PE:COUNT
## 1                  429           22273            9938             20847
## 2                  618            5945            6395              5921
## 3                 1118           70047           17574             60339
## 4                 3622            9477            5765             10445
## 5                 9972          106540           39477            144444
## 6                22950           42344           16510             33547
##   PointMut.SE:COUNT
## 1              7833
## 2              4149
## 3             16304
## 4              8894
## 5             41589
## 6             13598
WriteXLS(x = count_out, 
         ExcelFileName = 'COUNT.TE.xlsx', row.names = F, SheetNames = 'sheet1', na = '-')

Merge with RNAseq results

df3_rnaseq <- readxl::read_xlsx('../../../RNAseq_201906/post_snakemake/Mac200/liberal/COUNT.xlsx', na="-") # row.name in cts(matrix)
colnames(df3_rnaseq) <- gsub(":COUNT", "", colnames(df3_rnaseq))
df3_rnaseq <- df3_rnaseq[, c("Gene","Name","Type","Length", 
                             "W1118.CTRL.PE", "W1118.CTRL.SE",
                             "FRT19A.CTRL.PE", "FRT19A.CTRL.SE",
                             "DelMut.PE", "DelMut.SE",
                             "PointMut.PE","PointMut.SE")]
colnames(df3_rnaseq)
##  [1] "Gene"           "Name"           "Type"           "Length"        
##  [5] "W1118.CTRL.PE"  "W1118.CTRL.SE"  "FRT19A.CTRL.PE" "FRT19A.CTRL.SE"
##  [9] "DelMut.PE"      "DelMut.SE"      "PointMut.PE"    "PointMut.SE"
anno_his <- subset(df3_rnaseq, Type == 'aggregated_multi-loci_gene', select=c(1,2,3,4))


df_TE <- readxl::read_xlsx('COUNT.TE.xlsx', na="-") # row.name in cts(matrix)
colnames(df_TE) <- gsub(":COUNT", "", colnames(df_TE))
colnames(df_TE)
##  [1] "Gene"           "Name"           "Type"           "Length"        
##  [5] "W1118.CTRL.PE"  "W1118.CTRL.SE"  "FRT19A.CTRL.PE" "FRT19A.CTRL.SE"
##  [9] "DelMut.PE"      "DelMut.SE"      "PointMut.PE"    "PointMut.SE"
colnames(df3_rnaseq) == colnames(df_TE)
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
df <- rbind(df_TE, df3_rnaseq)
head(df)
## # A tibble: 6 x 12
##   Gene  Name  Type  Length W1118.CTRL.PE W1118.CTRL.SE FRT19A.CTRL.PE
##   <chr> <chr> <chr>  <dbl>         <dbl>         <dbl>          <dbl>
## 1 FBgn… FBgn… dm6.…   7411          2640           533           2231
## 2 FBgn… FBgn… dm6.…   7439          3732          3853            845
## 3 FBgn… FBgn… dm6.…   4648          1413           686           2326
## 4 FBgn… FBgn… dm6.…   6995          7325         11389           6271
## 5 FBgn… FBgn… dm6.…   6126         10501          6263          24469
## 6 FBgn… FBgn… dm6.…   7567         57063         17296          69945
## # … with 5 more variables: FRT19A.CTRL.SE <dbl>, DelMut.PE <dbl>,
## #   DelMut.SE <dbl>, PointMut.PE <dbl>, PointMut.SE <dbl>
tail(df)
## # A tibble: 6 x 12
##   Gene  Name  Type  Length W1118.CTRL.PE W1118.CTRL.SE FRT19A.CTRL.PE
##   <chr> <chr> <chr>  <dbl>         <dbl>         <dbl>          <dbl>
## 1 FBgn… FUM1  prot…   1861          4178          4979           4178
## 2 HIS1  HIS1  aggr…  23529          5281           136           3999
## 3 HIS2A HIS2A aggr…   9843         10930           182          10757
## 4 HIS2B HIS2B aggr…  13685         15569           239          18376
## 5 HIS3  HIS3  aggr…  12167          3055             7           2825
## 6 HIS4  HIS4  aggr…   9543          4387           118           4316
## # … with 5 more variables: FRT19A.CTRL.SE <dbl>, DelMut.PE <dbl>,
## #   DelMut.SE <dbl>, PointMut.PE <dbl>, PointMut.SE <dbl>

get cts again

cts <- df[, 5:ncol(df)]
cts <- as.matrix(cts)
row.names(cts) <- df$Gene
colnames(cts)
## [1] "W1118.CTRL.PE"  "W1118.CTRL.SE"  "FRT19A.CTRL.PE" "FRT19A.CTRL.SE"
## [5] "DelMut.PE"      "DelMut.SE"      "PointMut.PE"    "PointMut.SE"
head(cts)
##                    W1118.CTRL.PE W1118.CTRL.SE FRT19A.CTRL.PE
## FBgn0026065_Idefix          2640           533           2231
## FBgn0000004_17.6            3732          3853            845
## FBgn0000007_1731            1413           686           2326
## FBgn0000005_297             7325         11389           6271
## FBgn0005384_3S18           10501          6263          24469
## FBgn0000006_412            57063         17296          69945
##                    FRT19A.CTRL.SE DelMut.PE DelMut.SE PointMut.PE
## FBgn0026065_Idefix            429     22273      9938       20847
## FBgn0000004_17.6              618      5945      6395        5921
## FBgn0000007_1731             1118     70047     17574       60339
## FBgn0000005_297              3622      9477      5765       10445
## FBgn0005384_3S18             9972    106540     39477      144444
## FBgn0000006_412             22950     42344     16510       33547
##                    PointMut.SE
## FBgn0026065_Idefix        7833
## FBgn0000004_17.6          4149
## FBgn0000007_1731         16304
## FBgn0000005_297           8894
## FBgn0005384_3S18         41589
## FBgn0000006_412          13598
paste("lib-size:")
## [1] "lib-size:"
colSums(cts)
##  W1118.CTRL.PE  W1118.CTRL.SE FRT19A.CTRL.PE FRT19A.CTRL.SE      DelMut.PE 
##       27757124       28607428       27461516       28984435       26569878 
##      DelMut.SE    PointMut.PE    PointMut.SE 
##       27853744       27904256       28454147

COUNT output

count <- data.frame(cts)
colnames(count) <- paste(colnames(count),"COUNT", sep = ":")
count_out <- merge(anno, count, by.x=1, by.y=0, all.y=T, sort=F)
head(count_out)
##          Gene           Name           Type Length W1118.CTRL.PE:COUNT
## 1 FBgn0000003 7SLRNA:CR32864          ncRNA    299              857775
## 2 FBgn0000008              a protein_coding   5414                 498
## 3 FBgn0000014          abd-A protein_coding   5477                 205
## 4 FBgn0000015          Abd-B protein_coding  11791                 371
## 5 FBgn0000017            Abl protein_coding  12819                 974
## 6 FBgn0000018            abo protein_coding   1794                 305
##   W1118.CTRL.SE:COUNT FRT19A.CTRL.PE:COUNT FRT19A.CTRL.SE:COUNT
## 1                  12               708677                   18
## 2                 250                  527                  465
## 3                 568                  176                  767
## 4                 757                  275                  384
## 5                1967                  946                 1359
## 6                 291                  362                  306
##   DelMut.PE:COUNT DelMut.SE:COUNT PointMut.PE:COUNT PointMut.SE:COUNT
## 1          749146              26            868286                21
## 2             481             183               473               211
## 3             134             424               144               404
## 4             171             253               123               300
## 5            1291            2001              1268              2324
## 6             247             264               229               252
WriteXLS(x = count_out, 
         ExcelFileName = 'COUNT.xlsx', row.names = F, SheetNames = 'sheet1', na = '-')

TPM calculation

tpm <- calculateTPM(cts, df$Length)
tpm <- data.frame(tpm)
colnames(tpm) <- paste(colnames(tpm),"TPM",  sep = ":")
tpm_out <- merge(anno, tpm, by.x=1, by.y=0, all.y=T, sort=F)
head(tpm_out)
##          Gene           Name           Type Length W1118.CTRL.PE:TPM
## 1 FBgn0000003 7SLRNA:CR32864          ncRNA    299      1.466959e+05
## 2 FBgn0000008              a protein_coding   5414      4.703561e+00
## 3 FBgn0000014          abd-A protein_coding   5477      1.913933e+00
## 4 FBgn0000015          Abd-B protein_coding  11791      1.608937e+00
## 5 FBgn0000017            Abl protein_coding  12819      3.885263e+00
## 6 FBgn0000018            abo protein_coding   1794      8.693468e+00
##   W1118.CTRL.SE:TPM FRT19A.CTRL.PE:TPM FRT19A.CTRL.SE:TPM DelMut.PE:TPM
## 1          2.788843       1.272681e+05           4.024877  1.448736e+05
## 2          3.208749       5.226789e+00           5.742302  5.137135e+00
## 3          7.206421       1.725490e+00           9.362761  1.414673e+00
## 4          4.461278       1.252347e+00           2.177368  8.385705e-01
## 5         10.662628       3.962594e+00           7.087885  5.823260e+00
## 6         11.271575       1.083499e+01          11.403819  7.961019e+00
##   DelMut.SE:TPM PointMut.PE:TPM PointMut.SE:TPM
## 1      6.483794    1.534252e+05        5.031000
## 2      2.520344    4.615810e+00        2.791711
## 3      5.772317    1.389072e+00        5.283781
## 4      1.599915    5.511369e-01        1.822539
## 5     11.639116    5.226009e+00       12.986381
## 6     10.972574    6.744008e+00       10.061999
tail(tpm_out)
##                      Gene                Name                       Type
## 16678 FBgn0063755_Osvaldo FBgn0063755_Osvaldo             dm6.transposon
## 16679                HIS1                HIS1 aggregated_multi-loci_gene
## 16680               HIS2A               HIS2A aggregated_multi-loci_gene
## 16681               HIS2B               HIS2B aggregated_multi-loci_gene
## 16682                HIS3                HIS3 aggregated_multi-loci_gene
## 16683                HIS4                HIS4 aggregated_multi-loci_gene
##       Length W1118.CTRL.PE:TPM W1118.CTRL.SE:TPM FRT19A.CTRL.PE:TPM
## 16678   1543          0.463957        1.57621759          0.3479979
## 16679  23529         11.477000        0.40165159          9.1262112
## 16680   9843         56.781694        1.28486635         58.6821834
## 16681  13685         58.174356        1.21357648         72.1022407
## 16682  12167         12.839360        0.03997869         12.4674468
## 16683   9543         23.507063        0.85923337         24.2850551
##       FRT19A.CTRL.SE:TPM DelMut.PE:TPM DelMut.SE:TPM PointMut.PE:TPM
## 16678         0.04332967    8.39414005    9.76140836      9.10795186
## 16679         0.47168921    0.01965987    0.06021118      0.09206306
## 16680         1.75244157   30.89956586    1.21204721     32.72069271
## 16681         1.15297138   31.18634810    1.59642991     31.42953597
## 16682         0.03297001    6.36342878    0.11643864      7.80748566
## 16683         1.24005135   17.55931525    0.73446308     20.87187391
##       PointMut.SE:TPM
## 16678     7.659919365
## 16679     0.009133221
## 16680     0.589472723
## 16681     1.177725006
## 16682     0.076536046
## 16683     0.105087072
WriteXLS(x = tpm_out, 
         ExcelFileName = 'TPM.xlsx', row.names = F, SheetNames = 'sheet1', na = '-')

FPKKM calculation

fpkm <- calculateFPKM(cts, df$Length)
fpkm <- data.frame(fpkm)
colnames(fpkm) <- paste(colnames(fpkm), "FPKM", sep = ":")
fpkm_out <- merge(anno, fpkm, by.x=1, by.y=0, all.y=T, sort=F)
head(fpkm_out)
##          Gene           Name           Type Length W1118.CTRL.PE:FPKM
## 1 FBgn0000003 7SLRNA:CR32864          ncRNA    299       1.033541e+05
## 2 FBgn0000008              a protein_coding   5414       3.313879e+00
## 3 FBgn0000014          abd-A protein_coding   5477       1.348456e+00
## 4 FBgn0000015          Abd-B protein_coding  11791       1.133571e+00
## 5 FBgn0000017            Abl protein_coding  12819       2.737350e+00
## 6 FBgn0000018            abo protein_coding   1794       6.124955e+00
##   W1118.CTRL.SE:FPKM FRT19A.CTRL.PE:FPKM FRT19A.CTRL.SE:FPKM
## 1           1.402915        86308.315631            2.077000
## 2           1.614146            3.544605            2.963261
## 3           3.625157            1.170160            4.831565
## 4           2.244225            0.849293            1.123610
## 5           5.363785            2.687277            3.657636
## 6           5.670113            7.347873            5.884833
##   DelMut.PE:FPKM DelMut.SE:FPKM PointMut.PE:FPKM PointMut.SE:FPKM
## 1   9.429870e+04      3.1218971     1.040689e+05        2.4683261
## 2   3.343777e+00      1.2135265     3.130923e+00        1.3696785
## 3   9.208152e-01      2.7793263     9.422135e-01        2.5923465
## 4   5.458281e-01      0.7703468     3.738385e-01        0.8941802
## 5   3.790378e+00      5.6041450     3.544824e+00        6.3714225
## 6   5.181851e+00      5.2832104     4.574489e+00        4.9366522
tail(fpkm_out)
##                      Gene                Name                       Type
## 16678 FBgn0063755_Osvaldo FBgn0063755_Osvaldo             dm6.transposon
## 16679                HIS1                HIS1 aggregated_multi-loci_gene
## 16680               HIS2A               HIS2A aggregated_multi-loci_gene
## 16681               HIS2B               HIS2B aggregated_multi-loci_gene
## 16682                HIS3                HIS3 aggregated_multi-loci_gene
## 16683                HIS4                HIS4 aggregated_multi-loci_gene
##       Length W1118.CTRL.PE:FPKM W1118.CTRL.SE:FPKM FRT19A.CTRL.PE:FPKM
## 16678   1543          0.3268795          0.7929089           0.2359987
## 16679  23529          8.0860837          0.2020490           6.1890419
## 16680   9843         40.0053626          0.6463460          39.7959773
## 16681  13685         40.9865583          0.6104840          48.8969389
## 16682  12167          9.0459304          0.0201111           8.4549382
## 16683   9543         16.5618270          0.4322333          16.4691810
##       FRT19A.CTRL.SE:FPKM DelMut.PE:FPKM DelMut.SE:FPKM PointMut.PE:FPKM
## 16678          0.02235987     5.46377154     4.70004335       6.17796243
## 16679          0.24341076     0.01279667     0.02899122       0.06244676
## 16680          0.90433092    20.11262234     0.58359145      22.19458483
## 16681          0.59497999    20.29928978     0.76866877      21.31878773
## 16682          0.01701386     4.14197534     0.05606431       5.29585068
## 16683          0.63991679    11.42941223     0.35363835      14.15748072
##       PointMut.SE:FPKM
## 16678      3.758135610
## 16679      0.004480972
## 16680      0.289209106
## 16681      0.577819435
## 16682      0.037550375
## 16683      0.051558176
WriteXLS(x = fpkm_out, 
         ExcelFileName = 'FPKM.xlsx', row.names = F, SheetNames = 'sheet1', na = '-')

DESeq Experiment Design (One Big Model)

  • for QC and comparison with seperated model only
sample <- factor(rep(c("W1118", "FRT19A", "DelMut","PointMut"), each=2))
sample
## [1] W1118    W1118    FRT19A   FRT19A   DelMut   DelMut   PointMut PointMut
## Levels: DelMut FRT19A PointMut W1118
batch <- factor(rep(c("PE", "SE"), 4))
batch
## [1] PE SE PE SE PE SE PE SE
## Levels: PE SE
type <- factor(paste(sample, batch, sep = "_"))
type
## [1] W1118_PE    W1118_SE    FRT19A_PE   FRT19A_SE   DelMut_PE   DelMut_SE  
## [7] PointMut_PE PointMut_SE
## 8 Levels: DelMut_PE DelMut_SE FRT19A_PE FRT19A_SE ... W1118_SE
#mouse <- factor(rep(c("Mouse1", "Mouse2", "Mouse3"), 6))
coldata <- data.frame(row.names=colnames(cts), 
                      sample
                      , batch 
                      , type
                      )
coldata
##                  sample batch        type
## W1118.CTRL.PE     W1118    PE    W1118_PE
## W1118.CTRL.SE     W1118    SE    W1118_SE
## FRT19A.CTRL.PE   FRT19A    PE   FRT19A_PE
## FRT19A.CTRL.SE   FRT19A    SE   FRT19A_SE
## DelMut.PE        DelMut    PE   DelMut_PE
## DelMut.SE        DelMut    SE   DelMut_SE
## PointMut.PE    PointMut    PE PointMut_PE
## PointMut.SE    PointMut    SE PointMut_SE

Model fitting

dds <- DESeqDataSetFromMatrix(countData = cts, 
                              colData = coldata, 
                              design = ~  batch + sample
                              )  # converted to alph-order
## converting counts to integer mode
# dds$type <- relevel(dds$type, 
#                     ref = "DMSO_2D"
#                     )
# dds$mouse <- relevel(dds$mouse, 
#                      ref = "Mouse1"
#                      )

dds
## class: DESeqDataSet 
## dim: 16683 8 
## metadata(1): version
## assays(1): counts
## rownames(16683): FBgn0026065_Idefix FBgn0000004_17.6 ... HIS3 HIS4
## rowData names(0):
## colnames(8): W1118.CTRL.PE W1118.CTRL.SE ... PointMut.PE
##   PointMut.SE
## colData names(3): sample batch type
dds <-DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
resultsNames(dds)
## [1] "Intercept"                 "batch_SE_vs_PE"           
## [3] "sample_FRT19A_vs_DelMut"   "sample_PointMut_vs_DelMut"
## [5] "sample_W1118_vs_DelMut"
saveRDS(dds, file = 'dds.oneBigModel.rds')

QC Plots

Data transformation

vsd <- vst(dds, blind=FALSE)

#rld <- rlog(dds, blind=FALSE)
counts <- counts(dds, normalized=0)
logCounts <- log10(counts +1 )

normed <- counts(dds, normalized=1)
logNormed <- log10(normed+1)

Histogram of Log10(Counts)

hist(logCounts, main='log10(count+1)')

Dispersion plot

plotDispEsts(dds, main="Dispersion plot")

PCA plots

pcaData <- plotPCA(vsd, intgroup=c("sample", "batch"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))

ggplot(pcaData, aes(PC1, PC2, color=batch, shape=sample)) +
  geom_point(size=3) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  coord_fixed()

With label

library(ggplot2)
library(ggrepel)

pcaplot <- ggplot(pcaData, aes(PC1, PC2, color=batch, shape=sample)) +
    geom_point(size=3) +
    #geom_text(aes(label=type),hjust=1, vjust=0) +
    xlab(paste0("PC1: ",percentVar[1],"% variance")) +
    ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
    coord_fixed() +
    coord_fixed(ratio = percentVar[1]/percentVar[2]/2, 
                xlim = NULL, ylim = NULL, expand = T,
                clip = "off")
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
### geom_label_repel
pcaplot + 
  geom_label_repel(aes(label = type),
                  box.padding   = 0.35, 
                  point.padding = 0.5,
                  segment.color = 'grey50') +
  theme_classic()

Sample Heatmap

library("RColorBrewer")
library('pheatmap')
library("PoiClaClu")
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
poisd <- PoissonDistance(t(counts(dds)))
samplePoisDistMatrix <- as.matrix( poisd$dd ) 
rownames(samplePoisDistMatrix) <- paste( coldata$type, sep="-" ) 
colnames(samplePoisDistMatrix) <- NULL 
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(samplePoisDistMatrix,
         clustering_distance_rows=poisd$dd,
         clustering_distance_cols=poisd$dd,
         col=colors, 
         clustering_method='complete'
        )

Results from different contrasts

One Big Model (DelMut_vs_FRT19A)

#resultsNames(dds)
name <- "DelMut_vs_FRT19A"
contrast <- c('sample', 'DelMut', 'FRT19A')
res <- lfcShrink(dds, contrast = contrast, type = 'ashr')
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
res2 <- results(dds, contrast = contrast)
process_deseq_res(res = res, res2=res2, name = name, anno = anno, norm_exp = tpm)
## [1] "DelMut_vs_FRT19A"
## [1] "\n>>> Summary using FDR cut-off only (LFC not used)"
## 
## out of 16683 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 1749, 10%
## LFC < 0 (down)     : 2134, 13%
## outliers [1]       : 0, 0%
## low counts [2]     : 3235, 19%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
## 
## [1] "\n>>> Summary using both FDR and LFC_shrinked cut-off"
## sig_idx
## FALSE  TRUE 
## 15170  1513 
## up_idx
## FALSE  TRUE 
## 15683  1000 
## down_idx
## FALSE  TRUE 
## 16170   513

One Big Model (PointMut_vs_FRT19A)

#resultsNames(dds)
name <- "PointMut_vs_FRT19A"
contrast <- c('sample', 'PointMut', 'FRT19A')
res <- lfcShrink(dds, contrast = contrast, type = 'ashr')
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
res2 <- results(dds, contrast = contrast)
process_deseq_res(res = res, res2=res2, name = name, anno = anno, norm_exp = tpm)
## [1] "PointMut_vs_FRT19A"
## [1] "\n>>> Summary using FDR cut-off only (LFC not used)"
## 
## out of 16683 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 1736, 10%
## LFC < 0 (down)     : 1986, 12%
## outliers [1]       : 0, 0%
## low counts [2]     : 3235, 19%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
## 
## [1] "\n>>> Summary using both FDR and LFC_shrinked cut-off"
## sig_idx
## FALSE  TRUE 
## 15182  1501 
## up_idx
## FALSE  TRUE 
## 15607  1076 
## down_idx
## FALSE  TRUE 
## 16258   425

One Big Model (DelMut_vs_W1118)

#resultsNames(dds)
name <- "DelMut_vs_W1118"
contrast <- c('sample', 'DelMut', 'W1118')
res <- lfcShrink(dds, contrast = contrast, type = 'ashr')
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
res2 <- results(dds, contrast = contrast)
process_deseq_res(res = res, res2=res2, name = name, anno = anno, norm_exp = tpm)
## [1] "DelMut_vs_W1118"
## [1] "\n>>> Summary using FDR cut-off only (LFC not used)"
## 
## out of 16683 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 1530, 9.2%
## LFC < 0 (down)     : 1797, 11%
## outliers [1]       : 0, 0%
## low counts [2]     : 3235, 19%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
## 
## [1] "\n>>> Summary using both FDR and LFC_shrinked cut-off"
## sig_idx
## FALSE  TRUE 
## 15571  1112 
## up_idx
## FALSE  TRUE 
## 15970   713 
## down_idx
## FALSE  TRUE 
## 16284   399

One Big Model (PointMut_vs_W1118)

#resultsNames(dds)
name <- "PointMut_vs_W1118"
contrast <- c('sample', 'PointMut', 'W1118')
res <- lfcShrink(dds, contrast = contrast, type = 'ashr')
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
res2 <- results(dds, contrast = contrast)
process_deseq_res(res = res, res2=res2, name = name, anno = anno, norm_exp = tpm)
## [1] "PointMut_vs_W1118"
## [1] "\n>>> Summary using FDR cut-off only (LFC not used)"
## 
## out of 16683 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 1465, 8.8%
## LFC < 0 (down)     : 1727, 10%
## outliers [1]       : 0, 0%
## low counts [2]     : 3882, 23%
## (mean count < 9)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
## 
## [1] "\n>>> Summary using both FDR and LFC_shrinked cut-off"
## sig_idx
## FALSE  TRUE 
## 15597  1086 
## up_idx
## FALSE  TRUE 
## 15965   718 
## down_idx
## FALSE  TRUE 
## 16315   368